Final project

import the raw data to R and combine the 151 patients

tidy the original data and put them together

Wrangling data and make them into the right format

#load packages#
library("dplyr")
library("reshape")
library("ggplot2")
library("dendextend")
library("colorspace")
library("dendextend")
library("plotly")
setwd("/Users/lyn/Final_Project")
samples1<-read.csv("/Users/lyn/Final_Project/Step3. Data analysis/patient_sample.csv", header = TRUE, sep = ",", quote = "\"", dec = ".", fill = TRUE, row.names = 2)
genes1<-read.csv("/Users/lyn/Final_Project/TCGA_AML_mRNA_data_for_analysis.csv", header = TRUE, sep = ",", quote = "\"", dec = ".", fill = TRUE, row.names = 2)
genes1<-genes1[,-1]
samples1<-samples1[,-1]

Including Plots

Hierarchical clustering

#Defined the bottom color of the graph#
APOC2<-genes1["ENSG00000234906",]
genes1["APOC2",]<-APOC2
quantile(APOC2)
##                 0%        25%       50%       75%     100%
## ENSG00000234906  0 0.05992213 0.1456956 0.4639374 8.759468
highexpression<-which(APOC2 > 0.4639374)
lowexpression<-which(APOC2 <= 0.05992213 )
mediumexpression<-which(APOC2<=0.4639374 & APOC2>0.05992213)
genes1["APOC2",]<-replace(genes1["APOC2",], (highexpression), "HIGH")
genes1["APOC2",]<-replace(genes1["APOC2",], (lowexpression), "LOW")
genes1["APOC2",]<-replace(genes1["APOC2",], (mediumexpression), "MEDIUM")

hierarchical clustering for the genes

genes1_transsample_all<-t(genes1)
colorCodes<- c(HIGH ="red", LOW = "green", MEDIUM = "orange")
groupCodes<- genes1_transsample_all[,"APOC2"]
clusters1 <- hclust(dist(genes1_transsample_all))
## Warning in dist(genes1_transsample_all): NAs introduced by coercion
dend <-as.dendrogram(clusters1)
dend <-rotate(dend, 1:93)
## Warning in weights_for_order[order_x[order]] <- weights: number of items to
## replace is not a multiple of replacement length
dend <-color_branches(dend, k=3)
par(cex=0.5)#reduces font#
labels_colors(dend)<- colorCodes[genes1_transsample_all[,"APOC2"]][order.dendrogram(dend)]
p<-plot(dend)

#add apoc2 expression levels and value to the table and color the bottom lable accordingly#
APOC2_expression_level<-as.data.frame(genes1_transsample_all[,"APOC2"])
colnames(APOC2_expression_level)<-"APOC2_levels"
APOC2_expression_value<-as.data.frame(genes1_transsample_all[,"ENSG00000234906"])
colnames(APOC2_expression_value)<-"APOC2_expression"
sample_apoc2_levels<-APOC2_expression_level
sample_apoc2_value<-APOC2_expression_value
samples1_apoc2_levels<-cbind(samples1,sample_apoc2_levels)
samples1<-cbind(samples1_apoc2_levels, sample_apoc2_value)

PCA

#need to reload the genes table to exculde the characters#
genes1<-read.csv("/Users/lyn/Final_Project/TCGA_AML_mRNA_data_for_analysis.csv", header = TRUE, sep = ",", quote = "\"", dec = ".", fill = TRUE, row.names = 2)
genes1<-genes1[,-1]
min(genes1[genes1>0])
## [1] 0.0005565771
genes1.log<-log2(genes1+0.0005565771)
#genes1.log.small <- genes1.log[seq(1, nrow(genes1.log), 20), ]
#pca1<- prcomp(genes1.log.small,center = TRUE,scale. = TRUE)
pca1<- prcomp(genes1.log,center = TRUE,scale. = TRUE)

make it an interactive 3D chart

pcadf1<-data.frame(pca1$rotation)
samples1$patients<-rownames(samples1)
pcadf1$patients<-rownames(pcadf1)
samples1_pca<-inner_join(samples1,pcadf1,by="patients")
pcaplot<-plot_ly(samples1_pca, x = ~PC1, y = ~PC2, z = ~PC3,marker = list(symbol = 'circle', sizemode = 'diameter'), sizes = ~APOC2_expression, color = ~RACE, colors = ("Spectral"))%>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'PC1'),
                      yaxis = list(title = 'PC2'),
                      zaxis = list(title = 'PC3')))
pcaplot
#conduct TTEST to find out significant genes#
highexpression_patients<-genes1[highexpression]
lowexpression_patients<-genes1[lowexpression]
lowexpression_patients_transpose<-t(lowexpression_patients)
highexpression_patients_transpose<-t(highexpression_patients)
tableforttest<-cbind(lowexpression_patients_transpose, highexpression_patients_transpose)

result<-array(0,dim=c(54713,2))
for (i in 1:54713){
  result[i,1]<-t.test(tableforttest[,i]-tableforttest[,i+54713])$statistic
  result[i,2]<-t.test(tableforttest[,i]-tableforttest[,i+54713])$p.value
}
#result
genesselected<-as.data.frame(result)
rownames(genesselected)<-rownames(genes1)
colnames(genesselected)<-c("statistic_value","p_value")

looking for genes that p-value<0.05

p_value<-genesselected[,2]
significantgenes<-which(p_value<0.05)
significantgenes_id<-rownames(genesselected[significantgenes,])
tableforpathwayanalysis<-genesselected[significantgenes_id,]
tableforpathwayanalysis1<-tableforpathwayanalysis[order(tableforpathwayanalysis[,"p_value"]),]

Make heatmap according to the ttest result

#choose number of genes that participate in the analysis, for example,1:100#
genes1_selected<-genes1[rownames(tableforpathwayanalysis1[1:30,]),]
genes1_selected1<-genes1_selected[,colnames(highexpression_patients)]
genes1_selected2<-genes1_selected[,colnames(lowexpression_patients)]
genes1_selected_table<-cbind(genes1_selected1, genes1_selected2)
genes1_selected_table_transpose<-t(genes1_selected_table)
#normalize the distribution of the dataset#
min(genes1_selected_table_transpose[genes1_selected_table_transpose>0])
## [1] 0.006068098
genes1_selected_table_transpose.log<-log2(genes1_selected_table_transpose+0.003231274)
median_value<-median(genes1_selected_table_transpose.log)
melted_corr <- melt(genes1_selected_table_transpose.log)
#draw heatmap#
p<-ggplot(data = melted_corr, mapping = aes(x = X1,
                                     y = X2,
                                     fill = value))+
  geom_tile() +
  xlab(label = "patients") +
  ylab(label = "Genes") +
  scale_fill_gradient2(low="green", mid="white", high="red", midpoint=median_value) +
  theme(axis.title.y = element_blank(),
        axis.text.y = element_text(size = 2.5, vjust = 0.5),
        axis.text.x = element_text(angle = 90, size = 5, vjust = 0.5))
ggplotly(p)
#use another way to draw heatmap#
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
min(genes1_selected_table[genes1_selected_table>0])
## [1] 0.006068098
genes1_selected_table.log<-log2(genes1_selected_table+0.006068098)
genes2<-as.matrix(sapply(genes1_selected_table, as.numeric))
rownames(genes2)<-rownames(genes1_selected_table)
colfunc<-colorRampPalette(c("green", "red"))
pc<-heatmap.2(genes2,col=colfunc(15), scale = "row", trace  = "none")